clear all
set more off
cap log close
*************************************************************************************************
* Program: prepare_analysis_data.do
* Purpose: Prepare and merge data for validity check and regression analyses
* Sections:
*     1. Prepare Validity Check Data: 
*        a) LEMAS, City ACS, LEOKA, and merge them with patrol phone stats 
*        b) clean NYPD zip code officer counts data 
*     2. Prepare NYPD 311 call Data
*     3. Prepare Homicide Data
*     4. Clean and combine arrest data for 6 cities
*     5. Clean and combine stop data for 9 cities
*     6. Merge arrest, stop, police hour data for a subset of cities for regression  
*     7. Merge police hour (all hours/non-work hours) and covariates for regression
* Files Used:
*     1. LEMAS_2016.dta
*     2. acs13-17_places_demographics.dta
*     3. leoka_2017_officers_count.dta
*     4. patrol_phones_stats_by_city.dta
*     5. NYPDFoilOfficerHomeZipData.csv
*     6. 311_Service_Requests_2016_matched_bg.dta
*     7. homicide-data.csv
*     8. homicide-data-Austin.dta
*     9. NYC_homicide_13_15.dta
*     10. Chicago_Arrest_2017.dta
*     11. Adult_Arrests_DC.csv
*     12. Austin_Arrest_2017.dta
*     13. Dallas_Arrest_2017.dta
*     14. LA_Arrest_Data_from_2010_to_2019.csv
*     15. NYPD_Arrests_Data__Historic_.csv
*     16. Denver_police_pedestrian_stops_and_vehicle_stops.csv
*     17. NYPD_sqf-cy2017.dta
*     18. Chicago_stops.csv
*     19. Philly_car_ped_stops.csv
*     20. yg821jf8611_tx_san_antonio_2020_04_01.csv
*     21. tx_houston_2020_04_01.csv
*     22. yg821jf8611_ok_oklahoma_city_2020_04_01.csv
*     23. oh_columbus_2020_04_01.csv
*     24. yg821jf8611_tn_nashville_2020_04_01.csv
*     25. all census block group boundary data in DataRaw/census_bg_boundary
*     26. acs13-17_bg_demographics.dta
*     27. all_bgs_in_21cities.dta
*     27. patrol_time_in_bg.dta
*     28. patrol_time_in_bg_nonwork.dta
* Files Created:
*     1. LEMAS_2016_cities_racial_share.dta
*     2. city_race_acs_13_17.dta
*     3. leoka_2017_officers_count_25cities.dta
*     4. phones_LEMAS_LEOKA_validate.dta
*     5. NYPDFoilOfficerHomeZipData_long.dta
*     6. 311_calls_bg_count_NYC_2016.dta
*     7. bg_homicide_cleaned.dta
*     8. cities_arrest_bg_count.dta
*     9. cities_stop_bg_count.dta
*     10. bg_stop_arrest_per_hour_for_reg.dta
*     11. police_hour_data_for_reg.dta
*     12. police_hour_data_for_reg_nonwork.dta
* NOTES: Need to ssc install 
*        - gtools, outreg2, sutex2, geoinpoly, geonear, reghdfe, binscatter- if not already installed.
**************************************************************************************************

***************************************************************************************************
* Prepare data for validity check 
***************************************************************************************************

*************************
* prepare LEMAS data 
*************************

use "${DataRaw}/LEMAS_2016.dta",clear

gen keep_cities = 0

* missing data for INDIANAPOLIS (only has state police)
replace keep_cities = 1 if AGENCYNAME == "AUSTIN POLICE DEPARTMENT" & CITY == "AUSTIN" & STATE == "TX"
replace keep_cities = 1 if AGENCYNAME == "BOSTON POLICE DEPARTMENT" & CITY == "BOSTON" & STATE == "MA"
replace keep_cities = 1 if AGENCYNAME == "CHARLOTTEMECKLENBURG POLICE DEPARTMENT" & CITY == "CHARLOTTE" & STATE == "NC"
replace keep_cities = 1 if AGENCYNAME == "CHICAGO POLICE DEPARTMENT" & CITY == "CHICAGO" & STATE == "IL"
replace keep_cities = 1 if AGENCYNAME == "COLUMBUS POLICE DEPARTMENT" & CITY == "COLUMBUS" & STATE == "OH"
replace keep_cities = 1 if AGENCYNAME == "DALLAS POLICE DEPARTMENT" & CITY == "DALLAS" & STATE == "TX"
replace keep_cities = 1 if AGENCYNAME == "DENVER POLICE DEPARTMENT" & CITY == "DENVER" & STATE == "CO"
replace keep_cities = 1 if AGENCYNAME == "DETROIT POLICE DEPARTMENT" & CITY == "DETROIT" & STATE == "MI"
replace keep_cities = 1 if AGENCYNAME == "FORT WORTH POLICE DEPARTMENT" & CITY == "FORT WORTH" & STATE == "TX"
replace keep_cities = 1 if AGENCYNAME == "HOUSTON POLICE DEPARTMENT" & CITY == "HOUSTON" & STATE == "TX"
replace keep_cities = 1 if AGENCYNAME == "LOS ANGELES POLICE DEPARTMENT" & CITY == "LOS ANGELES" & STATE == "CA"
replace keep_cities = 1 if AGENCYNAME == "METRO NASHVILLE POLICE DEPARTMENT" & CITY == "NASHVILLE" & STATE == "TN"
replace keep_cities = 1 if AGENCYNAME == "NEW YORK CITY POLICE DEPARTMENT" & CITY == "NEW YORK" & STATE == "NY"
replace keep_cities = 1 if AGENCYNAME == "OKLAHOMA CITY POLICE DEPARTMENT" & CITY == "OKLAHOMA CITY" & STATE == "OK"
replace keep_cities = 1 if AGENCYNAME == "PHILADELPHIA CITY POLICE DEPARTMENT" & CITY == "PHILADELPHIA" & STATE == "PA"
replace keep_cities = 1 if AGENCYNAME == "PHOENIX POLICE DEPARTMENT" & CITY == "PHOENIX" & STATE == "AZ"
replace keep_cities = 1 if AGENCYNAME == "SAN ANTONIO POLICE DEPARTMENT" & CITY == "SAN ANTONIO" & STATE == "TX"
replace keep_cities = 1 if AGENCYNAME == "SAN DIEGO POLICE DEPARTMENT" & CITY == "SAN DIEGO" & STATE == "CA"
replace keep_cities = 1 if AGENCYNAME == "SAN FRANCISCO POLICE DEPARTMENT" & CITY == "SAN FRANCISCO" & STATE == "CA"
replace keep_cities = 1 if AGENCYNAME == "METROPOLITAN POLICE DEPARTMENT, DC" & CITY == "WASHINGTON" & STATE == "DC"

keep if keep_cities == 1

gen white = PERS_WHITE_MALE + PERS_WHITE_FEM
gen black = PERS_BLACK_MALE + PERS_BLACK_FEM
gen hisp = PERS_HISP_MALE + PERS_HISP_FEM
gen asian = PERS_ASIAN_MALE + PERS_ASIAN_FEM

gen police_pct_white = white / FTSWORN
gen police_pct_black = black / FTSWORN
gen police_pct_hisp = hisp / FTSWORN
gen police_pct_asian = asian / FTSWORN

gen sup_pct_white = PERS_SUP_INTM_WH / PERS_SUP_INTM_TOTR
gen sup_pct_black = PERS_SUP_INTM_BK  / PERS_SUP_INTM_TOTR
gen sup_pct_hisp = PERS_SUP_INTM_HS / PERS_SUP_INTM_TOTR
gen sup_pct_asian = PERS_SUP_INTM_AS  / PERS_SUP_INTM_TOTR

keep CITY STATE AGENCYNAME police_* sup_pct_*

gen city = CITY
replace city = subinstr(city, " ", "", .)
replace city = "NEWYORKCITY" if city == "NEWYORK"
save "${DataRaw}/LEMAS_2016_cities_racial_share.dta",replace


***************************************************
* City Race Data + Places Code 
***************************************************

use "${DataRaw}/acs13-17_places_demographics.dta", clear
gsort -tot_pop_acs_13_17
keep if _n<=30

drop if place == "Memphis city" | place == "Portland city" | place == "Las Vegas city" | place == "Baltimore city" | place == "Louisville/Jefferson County metro government (balance)"

replace place = upper(place)
replace place = subinstr(place," CITY","",.)
replace place = subinstr(place," ","",.)
replace place = "INDIANAPOLIS" if place == "INDIANAPOLIS(BALANCE)"
replace place = "NASHVILLE" if place == "NASHVILLE-DAVIDSONMETROPOLITANGOVERNMENT(BALANCE)"
replace place = "OKLAHOMACITY" if place == "OKLAHOMA"
replace place = "NEWYORKCITY" if place == "NEWYORK"

ren place city
gen fips_st_place_code = statea + placea

keep city fips_st_place_code pct_nh_white_alone_acs_13_17 pct_nh_blk_alone_acs_13_17 pct_nh_asian_alone_acs_13_17 pct_hisp_latino_acs_13_17 tot_pop_acs_13_17 med_hh_inc_acs_2017 pct_college_acs_13_17

ren pct_* city_pct*
ren med_* city_med_*
ren tot_pop_acs_13_17 city_tot_pop_acs_13_17

save "${DataRaw}/city_race_acs_13_17.dta",replace

***************************************************
* LEOKA officer count data for the 25 cities
***************************************************

use "${DataRaw}/leoka_2017_officers_count.dta",clear

gen fips_st_place_code = fips_state_code + fips_place_code
replace fips_st_place_code = "3712000" if agency_name == "charlotte-mecklenburg"

merge m:1 fips_st_place_code using "${DataRaw}/city_race_acs_13_17.dta", keep(match) keepusing(city) nogenerate

drop if population == 0  

keep if agency_type == "local police department"
keep fips_st_place_code city total_employees_officers total_employees_civilians total_employees_officers

save "${DataRaw}/leoka_2017_officers_count_25cities.dta",replace

************************************
* merge phone stats with Leoka/LEMAS 
*************************************

use "${DataRaw}/patrol_phones_stats_by_city.dta",clear

* merge w/ LEOKA data
merge 1:1 city using "${DataRaw}/leoka_2017_officers_count_25cities.dta", nogenerate ///
	keepusing(total_employees_officers)

* merge w/ LEMAS data
merge 1:1 city using "${DataRaw}/LEMAS_2016_cities_racial_share.dta", nogenerate ///
	keepusing(police_pct_white police_pct_black police_pct_hisp police_pct_asian)

* merge w/ city race composition (ACS 13-17)
merge 1:1 city using "${DataRaw}/city_race_acs_13_17.dta", nogenerate ///
	keepusing(city_tot_pop_acs_13_17 city_pcthisp_latino_acs_13_17 city_pctnh_white_alone_acs_13_17 city_pctnh_blk_alone_acs_13_17 city_pctnh_asian_alone_acs_13_17)

* to create 45 degree line
gen linevar = 0.1* _n - 0.1
replace linevar = 1 if linevar > 1

* create label for city 
gen city2 = proper(city)
replace city2 = "Los Angeles" if city2 == "Losangeles"
replace city2 = "NYC" if city2 == "Newyorkcity"
replace city2 = "San Francisco" if city2 == "Sanfrancisco"
replace city2 = "El Paso" if city2 == "Elpaso"
replace city2 = "San Diego" if city2 == "Sandiego"
replace city2 = "San Antonio" if city2== "Sanantonio"
replace city2 = "Oklahoma City" if city2 == "Oklahomacity"
replace city2 = "Fort Worth" if city2 == "Fortworth"

* create another label for table
gen city3 = city2
replace city3 = "New York City" if city2 == "NYC"

* label variables
label variable police_pct_white "Police: % White"
label variable police_pct_black "Police: % Black"
label variable police_pct_hisp "Police: % Hispanic"
label variable police_pct_asian "Police: % Asian"

label variable phones_white "Smartphone: % White"
label variable phones_black "Smartphone: % Black"
label variable phones_hisp "Smartphone: % Hispanic"
label variable phones_asian "Smartphone: % Asian"

label variable city_pctnh_white_alone_acs_13_17 "City % White"
label variable city_pctnh_blk_alone_acs_13_17 "City % Black"
label variable city_pcthisp_latino_acs_13_17 "City % Hispanic"
label variable city_pctnh_asian_alone_acs_13_17 "City % Asian"
label variable city_tot_pop_acs_13_17 "City Total Population"

label variable city3 "City"
label variable N_patrol_phones_2017 "Patrol Phone Counts"
label variable total_employees_officers "Officer Counts (UCR)"

* drop cities 
drop if inlist(city, "SANJOSE", "JACKSONVILLE", "SEATTLE", "ELPASO")

save "${DataCleaned}/phones_LEMAS_LEOKA_validate.dta",replace

**********************************************************************
** Data preparation: NYPD's officer's home information 
**********************************************************************
** https://github.com/Bellspringsteen/other.nyc/blob/master/NYCGOV/NYPD/NypdOfficersHomeZip/data/NYPDFoilOfficerHomeZipData.csv
** reshape this csv from wide to long
import delimited "${DataRaw}/NYPDFoilOfficerHomeZipData.csv",clear
set more off
foreach var of varlist v* {
    local label : variable label `var'
    if ("`label'" != "") {
        local oldnames `oldnames' `var'
        local newnames `newnames' _`label'
    }
	
}

rename (`oldnames')(`newnames')

keep if command=="Total : "

drop lat lng zip

drop command

foreach var of varlist _* {
	replace `var' = 0 if `var'==.
}

gen i = 1
reshape long _, i(i) j(zipcode)

gen zipcode_str = string(zipcode)
replace zipcode_str = "07631" if zipcode==7631
replace zipcode_str = "07652" if zipcode==7652
replace zipcode_str = "07728" if zipcode==7728

ren _ N_officer

replace N_officer = _07631 if zipcode==7631
replace N_officer = _07652 if zipcode==7652
replace N_officer = _07728 if zipcode==7728

drop i _07631 _07652 _07728 total

save "${DataRaw}/NYPDFoilOfficerHomeZipData_long.dta",replace

***************************************************************************************************
* prepare NY 311 call data 
***************************************************************************************************
/*
* The data 311_Service_Requests_from_2010_to_Present.csv is too large (over 15 GB)
* so only post cleaned 2016 311 service requests data 

import delimited "${dropbox}/Data/311_calls/311_Service_Requests_from_2010_to_Present.csv", clear

sort uniquekey
drop if uniquekey == uniquekey[_n-1]
drop if missing(latitude)

* assign lat,long to census block group *
set more off
geoinpoly latitude longitude using "${BG_boundary}/tl_2019_36_bg_coor.dta"

merge m:1 _ID using "${BG_boundary}/tl_2019_36_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace
drop _ID

save "${DataRaw}/311_Service_Requests_2016_matched_bg.dta", replace
*/

use "${DataRaw}/311_Service_Requests_2016_matched_bg.dta", clear

* calculate number of all calls in BGs *
gegen tot_311_calls = count(uniquekey), by(gidbg)

* calls handled by different agency
set more off
local agencies "NYPD HPD DOT DEP DSNY DOB DPR DOHMH DHS"
foreach agency of local agencies{

	gen byte `agency'_call = (agency == "`agency'")
	gegen tot_311_calls_`agency' = total(`agency'_call), by(gidbg)

}

sort gidbg 
drop if gidbg == gidbg[_n-1]

keep gidbg tot_311*

save "${DataRaw}/311_calls_bg_count_NYC_2016.dta",replace


***************************************************************************************************
* prepare Homicide data
***************************************************************************************************

*************************************
* Calculate yearly homicide count
*************************************
set more off

import delimited "${DataRaw}/homicide-data.csv",clear

gen year = floor(reported_date/10000)

keep uid city state lat lon reported_date uid year

* append data 
append using "${DataRaw}/homicide-data-Austin.dta" 
append using "${DataRaw}/NYC_homicide_13_15.dta" 

* keep city 
gen keep_obs = 0
replace keep_obs = 1 if inlist(city, "Austin", "Boston", "Charlotte", "Chicago", "Columbus")
replace keep_obs = 1 if inlist(city, "Dallas", "Denver", "Detroit", "Fort Worth", "Houston")
replace keep_obs = 1 if inlist(city, "Indianapolis", "Los Angeles", "Nashville", "New York", "Oklahoma City")
replace keep_obs = 1 if inlist(city, "Philadelphia", "Phoenix", "San Antonio", "San Diego", "San Francisco", "Washington")

keep if keep_obs == 1
drop keep_obs

* match the lat/long to a census block group * 
set more off
local sts "04 06 08 11 17 18 25 26 36 37 39 40 42 47 48"
foreach st of local sts{
geoinpoly lat lon using "${BG_boundary}/tl_2019_`st'_bg_coor.dta"
ren _ID _ID_`st'
}

local sts "04 06 08 11 17 18 25 26 36 37 39 40 42 47 48"
foreach st of local sts{
	ren _ID_`st' _ID
	merge m:1 _ID using "${BG_boundary}/tl_2019_`st'_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate
	ren _ID _ID_`st'
	ren GEOID GEOID_`st'
}

gen gidbg = ""
local sts "04 06 08 11 17 18 25 26 36 37 39 40 42 47 48"
foreach st of local sts{
replace gidbg = GEOID_`st' if ~missing(_ID_`st')
}

assert ~missing(gidbg) if ~missing(lat)
drop GEOID_* _ID_*

* calculate 
gegen homicide_yearly_count = count(reported_date), by(gidbg year) // count(varlist) must call a numeric variable list.

sort gidbg year
drop if gidbg==gidbg[_n-1] & year==year[_n-1]
drop if missing(gidbg)

keep year state homicide_yearly_count gidbg city
save "${DataCleaned}/bg_homicide_count_by_year.dta",replace

keep if year == 2016
destring gidbg, replace
save "${DataCleaned}/bg_homicide_count_2016.dta",replace

* note: need to fill in the zero for bgs without homicide * 
use "${DataCleaned}/all_bgs_in_21cities.dta", clear
drop if city == "SanJose"
merge 1:1 gidbg using "${DataCleaned}/bg_homicide_count_2016.dta", keep(master match)
replace homicide_yearly_count = 0 if homicide_yearly_count==. & city != "Seattle" & city != "ElPaso" // _merge == 1 means Seattle, and El Paso
drop _merge
save "${DataCleaned}/bg_homicide_count_2016.dta",replace

*********************************************
* calculate avg homicide count from 13-16 (for 21 cities)
*********************************************
use "${DataCleaned}/bg_homicide_count_by_year.dta",clear

replace year = 2015 if year==20151
keep if year>=2013 & year<=2016

gegen homicide_count_13_16_avg = mean(homicide_yearly_count), by(gidbg)

sort gidbg 
drop if gidbg==gidbg[_n-1]
keep city gidbg homicide_count_13_16_avg

destring gidbg, replace
save "${DataCleaned}/bg_homicide_count_13_16.dta",replace

* fill in zero values for other BGs 
use "${DataCleaned}/all_bgs_in_21cities.dta", clear
drop if city == "SanJose"
merge 1:1 gidbg using "${DataCleaned}/bg_homicide_count_13_16.dta", keep(master match)
replace homicide_count_13_16_avg = 0 if homicide_count_13_16_avg ==. & city != "Seattle" & city != "ElPaso" 
drop _merge
save "${DataCleaned}/bg_homicide_count_13_16.dta", replace

*********************************************
* Calculate the nearest distance to homicide for each block group
*********************************************
set more off
forval y = 2013/2016{
	import delimited "${DataRaw}/homicide-data.csv",clear
	gen year = floor(reported_date/10000)
	replace year = 2015 if year==20151
	keep uid city state lat lon reported_date uid year
	append using "${DataRaw}/homicide-data-Austin.dta" 
	append using "${DataRaw}/NYC_homicide_13_15.dta" 

	gen keep_obs = 0
	replace keep_obs = 1 if inlist(city, "Austin", "Boston", "Charlotte", "Chicago", "Columbus")
	replace keep_obs = 1 if inlist(city, "Dallas", "Denver", "Detroit", "Fort Worth", "Houston")
	replace keep_obs = 1 if inlist(city, "Indianapolis", "Los Angeles", "Nashville", "New York", "Oklahoma City")
	replace keep_obs = 1 if inlist(city, "Philadelphia", "Phoenix", "San Antonio", "San Diego", "San Francisco", "Washington")

	keep if keep_obs == 1
	drop keep_obs
	
	keep if year == `y'
	gen homicide_id`y' = _n
	save "${DataCleaned}/homicide_data_`y'.dta",replace
}


* calculate to the nearest homicide by year using -geonear- * 
set more off
forval yr = 13/16{
use "${DataCleaned}/all_bgs_in_21cities.dta", clear
drop if city == "SanJose"
gen bg_id = _n
geonear bg_id INTPTLAT INTPTLON using "${DataCleaned}/homicide_data_20`yr'.dta", ///
	neighbors(homicide_id20`yr' lat lon) 
ren km_to_nid km_to_nearest_homicide`yr'
keep gidbg km_to_nearest_homicide`yr' city

replace km_to_nearest_homicide`yr' = . if city == "NY" & `yr' <= 12
replace km_to_nearest_homicide`yr' = . if city == "SanAntonio" & `yr' <= 12
replace km_to_nearest_homicide`yr' = . if city == "Seattle" | city == "ElPaso"

save "${DataCleaned}/bg_to_nearest_homicide`yr'.dta",replace
}

set more off
use "${DataCleaned}/bg_to_nearest_homicide13.dta",clear
forval yr = 14/16{
	merge 1:1 gidbg using "${DataCleaned}/bg_to_nearest_homicide`yr'.dta", nogenerate
}
save "${DataCleaned}/bg_to_nearest_homicide.dta",replace

* remove intermediate temporary data 
forval yr = 13/16{
	rm "${DataCleaned}/bg_to_nearest_homicide`yr'.dta"
	rm "${DataCleaned}/homicide_data_20`yr'.dta"
}

********************************
* merge all homicide-related data
********************************
use "${DataCleaned}/bg_homicide_count_13_16.dta", clear
merge m:1 gidbg using "${DataCleaned}/bg_to_nearest_homicide.dta", nogenerate
merge m:1 gidbg using "${DataCleaned}/bg_homicide_count_2016.dta", nogenerate

ren homicide_yearly_count homicide_count_16
label variable homicide_count_16 "Homicide Count 2016"
label variable homicide_count_13_16_avg "Avg 13-16 Homicide Count"

forval yr = 13/16{
	label variable km_to_nearest_homicide`yr' "Distance to nearest 20`yr' homicide (km)"
}

drop state year
save "${DataCleaned}/bg_homicide_cleaned.dta", replace

* remove intermediate temporary data 
rm "${DataCleaned}/bg_homicide_count_13_16.dta"
rm "${DataCleaned}/bg_to_nearest_homicide.dta"
rm "${DataCleaned}/bg_homicide_count_2016.dta"
rm "${DataCleaned}/bg_homicide_count_by_year.dta"

***************************************************************************************************
** Prepare block group arrest count data 
** 6 cities: Chicago, DC, Austin, Dallas, LA, NYC
***************************************************************************************************

***********************************
** Chicago arrest data
***********************************

use "${Arrest_Stop}/Chicago_Arrest_2017.dta", clear

* match it to census block group * 
geoinpoly lat_geocode lng_geocode using "${BG_boundary}/tl_2019_17_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_17_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg, replace 
drop _ID

drop if missing(gidbg) // 20 observations are dropped
sort cbno
drop if cbno == cbno[_n-1] // drop duplicate arrest observations - here no observations are dropped

* count arrest by block group 
gegen bg_arrest_count = count(cbno), by(gidbg) // including multiple invidivuals at the same time

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_arrest_count

gen city = "CHICAGO"

save "${DataCleaned}/Chicago_arrests_2017_bg_count.dta", replace 

***********************************
** Washington DC arrest data ** 
***********************************

import delimited "${Arrest_Stop}/Adult_Arrests_DC.csv", clear

* keep 2017 data
keep if year == 2017

drop if missing(arrest_latitude) // 451

* match lat-long to census block group
geoinpoly arrest_latitude arrest_longitude using "${BG_boundary}/tl_2019_11_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_11_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID
drop if missing(gidbg) // 123 observations are dropped

* drop duplicates if needed
sort objectid
drop if objectid == objectid[_n-1]

* count arrest by block group 
gegen bg_arrest_count = count(objectid), by(gidbg) // including multiple invidivuals at the same time

* collapse data to block group 
sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_arrest_count

gen city = "WASHINGTON"

save "${DataCleaned}/washington_dc_arrests_2017_bg_count.dta", replace 

***********************************
* Austin Arrest Data
***********************************

use "${Arrest_Stop}/Austin_Arrest_2017.dta", clear 

* match lat/long to census block group
geoinpoly lat_geocode lng_geocode using "${BG_boundary}/tl_2019_48_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_48_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

drop if missing(gidbg) // 20 observations are dropped

sort primary_key
drop if primary_key == primary_key[_n-1] // drop duplicate arrest observations 

* count arrest by block group 
gegen bg_arrest_count = count(primary_key), by(gidbg) // including multiple invidivuals at the same time

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_arrest_count

gen city = "AUSTIN"
save "${DataCleaned}/austin_arrests_2017_bg_count.dta", replace 

***********************************
** Dallas Arrest Data
***********************************

use "${Arrest_Stop}/Dallas_Arrest_2017.dta", clear

* match it to census block group
geoinpoly lat_geocode lng_geocode using "${BG_boundary}/tl_2019_48_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_48_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID
drop if missing(gidbg) // 1 observations are dropped

* drop duplicates 
sort incidentnum
drop if incidentnum == incidentnum[_n-1]

* count arrest by block group 
gegen bg_arrest_count = count(arrestyr), by(gidbg) // including multiple invidivuals at the same time

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_arrest_count

gen city = "DALLAS"
save "${DataCleaned}/dallas_arrests_2017_bg_count.dta", replace 

***********************************
** LAPD arrest data (2017)
***********************************

import delimited "${Arrest_Stop}/LA_Arrest_Data_from_2010_to_2019.csv", clear

gen year = substr(arrestdate, 7, 4)
destring year, replace  
keep if year == 2017

gunique reportid // No duplicates
drop if lat == 0

* match lat/lon to census block group
geoinpoly lat lon using "${BG_boundary}/tl_2019_06_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_06_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

* count arrest by block group 
gegen bg_arrest_count = count(reportid), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_arrest_count

gen city = "LOSANGELES"

save "${DataCleaned}/los_angeles_arrests_2017_bg_count.dta", replace 

***********************************
** NYPD arrest data (2017)
***********************************

import delimited "${Arrest_Stop}/NYPD_Arrests_Data__Historic_.csv", clear

gen year = substr(arrest_date, 7, 4)
destring year, replace  

keep if year == 2017

* Check duplicates 
gunique arrest_key // No duplicates
drop x_coord_cd y_coord_cd lon_lat

* match lat/lon to census block group
geoinpoly latitude longitude using "${BG_boundary}/tl_2019_36_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_36_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

* count arrest by block group 
gegen bg_arrest_count = count(arrest_key), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_arrest_count

gen city = "NEWYORKCITY"

save "${DataCleaned}/new_york_city_arrests_2017_bg_count.dta", replace 

***********************************
* append arrest data 
***********************************
use "${DataCleaned}/new_york_city_arrests_2017_bg_count.dta", clear
append using "${DataCleaned}/los_angeles_arrests_2017_bg_count.dta"
append using "${DataCleaned}/dallas_arrests_2017_bg_count.dta"
append using "${DataCleaned}/austin_arrests_2017_bg_count.dta"
append using  "${DataCleaned}/washington_dc_arrests_2017_bg_count.dta"
append using "${DataCleaned}/Chicago_arrests_2017_bg_count.dta"

keep gidbg bg_arrest_count city
gen arrest_year = 2017 

save "${DataCleaned}/cities_arrest_bg_count.dta",replace 

* remove intermediate data 
local filelist: dir "${DataCleaned}/" files "*arrests_2017_bg_count.dta"
clear
foreach f of local filelist{
 rm "${DataCleaned}/`f'"
}

***************************************************************************************************
* Prepare block group stop count data 
***************************************************************************************************

***********************************
** Denver stop data
***********************************

* Read csv and save as stata file * 
import delimited "${Arrest_Stop}/Denver_police_pedestrian_stops_and_vehicle_stops.csv", clear

* keep 2017 data
gen year = substr(time_phonepickup, 1, 4)
destring year, replace  

keep if year == 2017

* drop duplicates
sort master_incident_number
drop if master_incident_number == master_incident_number[_n-1]

* match lat/lon to census block group
geoinpoly geo_lat geo_lon using "${BG_boundary}/tl_2019_08_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_08_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate
ren GEOID gidbg
destring gidbg , replace 

* count stop by block group 
gegen bg_stop_count = count(_ID), by(gidbg)
drop _ID

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "DENVER"

save "${DataCleaned}/denver_stop_2017_bg_count.dta", replace 

***********************************
** NYC
***********************************

use "${Arrest_Stop}/NYPD_sqf-cy2017.dta", clear 

* match lat/lon to census block group
geoinpoly lat_geocode lng_geocode using "${BG_boundary}/tl_2019_36_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_36_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID 

drop if missing(gidbg) // 9 observations are dropped

* count stop by block group 
gegen bg_stop_count = count(STOP_FRISK_ID), by(gidbg) // including multiple invidivuals at the same time

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "NEWYORKCITY"

save "${DataCleaned}/new_york_city_stop_2017_bg_count.dta", replace 

***********************************
** Chicago 2015 stop data from Bocar 2021 et al
***********************************
* Read csv and save as stata file * 
import delimited "${Arrest_Stop}/Chicago_stops.csv",clear

gen year = substr(date, 1, 4)
destring year, replace  

keep if year == 2015

* gunique stop_id // there have been duplicates of stop_id, with each stop_id associated with different officers
* drop duplicates of stop events 
sort stop_id 
drop if stop_id == stop_id[_n-1]

gen byte missing_location = (lat == "NA")
tab missing_location

replace lat = "." if lat == "NA"
replace lon = "." if lon == "NA"
destring lat lon, replace 

* match lat/lon to census block group
geoinpoly lat lon using "${BG_boundary}/tl_2019_17_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_17_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

drop if missing_location

* count stop by block group 
gegen bg_stop_count = count(stop_id), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "CHICAGO"

save "${DataCleaned}/Chicago_stop_2015_bg_count.dta", replace 

***********************************
** Philly stop data
***********************************
import delimited using "${Arrest_Stop}/Philly_car_ped_stops.csv",clear

drop if lat == .

* match lat/lon to census block group
geoinpoly lat lng using "${BG_boundary}/tl_2019_42_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_42_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

* count stop by block group 
gegen bg_stop_count = count(objectid), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "PHILADELPHIA"

save "${DataCleaned}/Philadelphia_stop_2017_bg_count.dta", replace 

***********************************
** San antonio stop data
***********************************
* Read csv and save as stata file * 
import delimited "${Arrest_Stop}/yg821jf8611_tx_san_antonio_2020_04_01.csv", clear varnames(1)

* keep 2017 data
gen year = substr(date, 1, 4)
destring year, replace  
keep if year == 2017

replace  lat = "." if lat == "NA"
replace  lng = "." if lng == "NA"
destring lat lng, replace 
drop if missing(lat) // only 214 observations are dropped

* match lat/lon to census block group
geoinpoly lat lng using "${BG_boundary}/tl_2019_48_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_48_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

* count stop by block group 
gegen bg_stop_count = count(v1), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "SANANTONIO"

save "${DataCleaned}/san_antonio_stop_2017_bg_count.dta", replace 

***********************************
** Houston stop data
***********************************
* Read csv and save as stata file * 
import delimited "${Arrest_Stop}/tx_houston_2020_04_01.csv", varnames(1) encoding(ISO-8859-1)clear

* keep 2017 data
gen year = substr(date, 1, 4)
destring year, replace  
keep if year == 2017

replace  lat = "." if lat == "NA"
replace  lng = "." if lng == "NA"
destring lat lng, replace 
drop if missing(lat)  

* match lat/lon to census block group
geoinpoly lat lng using "${BG_boundary}/tl_2019_48_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_48_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

* count stop by block group 
gegen bg_stop_count = count(year), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_* bg_stop_count

gen city = "HOUSTON"

save "${DataCleaned}/houston_stop_2017_bg_count.dta", replace 

***********************************
** oklahoma city stop data
***********************************
* Read csv and save as stata file * 
import delimited "${Arrest_Stop}/yg821jf8611_ok_oklahoma_city_2020_04_01.csv", clear

* drop duplicates
sort raw_row_number
drop if raw_row_number == raw_row_number[_n-1]

gen year = substr(date, 1, 4)
destring year, replace  

* note: the closest year w/ complete annual data (to 2017) is 2016 and thus keep 2016 data
* 2017 data is not complete (missing for November and December)
keep if year == 2016 

replace lat = "." if lat == "NA"
replace lng = "." if lng == "NA"
destring lat lng, replace 
drop if missing(lat)  

* match lat/lon to census block group
geoinpoly lat lng using "${BG_boundary}/tl_2019_40_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_40_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

* count stop by block group 
gegen bg_stop_count = count(v1), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "OKLAHOMACITY"

save "${DataCleaned}/oklahoma_city_stop_2016_bg_count.dta", replace 

***********************************
** columbus stop data
***********************************
* Read csv and save as stata file * 
import delimited "${Arrest_Stop}/oh_columbus_2020_04_01.csv", varnames(1) encoding(ISO-8859-1)clear

gen year = substr(date, 1, 4)
destring year, replace  

* note: the closest year (to 2017) is 2016 and thus keep 2016 data
keep if year == 2016  

replace  lat = "." if lat == "NA"
replace  lng = "." if lng == "NA"

destring lat lng, replace 

gen byte missing_location = (lat == .)
tab missing_location  //   8.53%

* match lat/lon to census block group
geoinpoly lat lng using "${BG_boundary}/tl_2019_39_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_39_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

* count stop by block group 
gegen bg_stop_count = count(gidbg), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "COLUMBUS"

save "${DataCleaned}/columbus_stop_2016_bg_count.dta", replace 

***********************************
** nashville stop data
***********************************
* Read csv and save as stata file * 
import delimited "${Arrest_Stop}/yg821jf8611_tn_nashville_2020_04_01.csv",  clear

replace  lat = "." if lat == "NA"
replace  lng = "." if lng == "NA"
destring lat lng, replace 
*gen byte missing_location = (lat == .)
*tab missing_location  //  7.65%
drop if missing(lat)

geoinpoly lat lng using "${BG_boundary}/tl_2019_47_bg_coor.dta"
merge m:1 _ID using "${BG_boundary}/tl_2019_47_bg_data.dta", keepusing(GEOID) keep(master match) nogenerate

ren GEOID gidbg
destring gidbg , replace 
drop _ID

gegen bg_stop_count = count(gidbg), by(gidbg)

sort gidbg
drop if gidbg == gidbg[_n-1]
keep gidbg bg_stop_count

gen city = "NASHVILLE"

save "${DataCleaned}/nashville_stop_2017_bg_count.dta", replace 

***********************************
* append stop data
***********************************
set more off
local filelist: dir "${DataCleaned}/" files "*stop_20*_bg_count.dta"
clear
foreach f of local filelist{
append using "${DataCleaned}/`f'"
}

gen stop_data_year = 2017 if inlist(city, "NEWYORKCITY", "DENVER", "HOUSTON", "NASHVILLE", "SANANTONIO", "PHILADELPHIA")
replace  stop_data_year = 2015 if city == "CHICAGO"
replace  stop_data_year = 2016 if inlist(city, "COLUMBUS", "OKLAHOMACITY")

keep gidbg bg_stop_count city stop_data_year

save "${DataCleaned}/cities_stop_bg_count.dta",replace 

* remove intermediate data 
local filelist: dir "${DataCleaned}/" files "*stop_20*_bg_count.dta"
clear
foreach f of local filelist{
 rm "${DataCleaned}/`f'"
}


***************************************************************************************************
** match all cities' stop / arrest data w/ cell phone data: prepare for regression
***************************************************************************************************
use "${DataCleaned}/patrol_time_in_bg.dta", clear

* keep cities with stop OR arrest data //
* inlist cannot include more than 10 cities : expression too long
gen city_keep = 1 if inlist(city, "DENVER", "LOSANGELES", "PHILADELPHIA", "NEWYORKCITY", "CHICAGO", "COLUMBUS")
replace  city_keep = 1 if inlist(city, "HOUSTON", "NASHVILLE", "SANANTONIO", "OKLAHOMACITY", "AUSTIN", "DALLAS", "WASHINGTON")
keep if city_keep == 1
drop city_keep

* merge with stop counts
merge 1:1 gidbg city using "${DataCleaned}/cities_stop_bg_count.dta", keep(master match) 
ren _merge _merge_stop

* merge w/ arrest counts 
merge 1:1 gidbg city using "${DataCleaned}/cities_arrest_bg_count.dta", keep(master match)
ren _merge _merge_arrest

* fill in zeros for stop/arrest data
replace  bg_stop_count = 0 if missing(bg_stop_count) & _merge_stop == 1 & city != "LOSANGELES" & city != "DALLAS" & city != "AUSTIN" & city != "WASHINGTON" // _merge == 1 means block groups within the precincts but no showing up in the police stop data
replace  bg_arrest_count = 0 if missing(bg_arrest_count) & _merge_arrest == 1 & inlist(city, "LOSANGELES", "NEWYORKCITY", "CHICAGO", "DALLAS", "AUSTIN","WASHINGTON")

* merge with bg-level demographics
merge m:1 gidbg using "${DataRaw}/acs13-17_bg_demographics.dta", nogenerate keep(master match) ///
	keepusing(tot_population_acs_13_17 mail_return_rate_cen_2010 pct_nh_white_alone_acs_13_17 pct_hispanic_acs_13_17 pct_nh_blk_alone_acs_13_17 pct_nh_asian_alone_acs_13_17 pct_college_acs_13_17 med_hhd_inc_bg_acs_13_17)

* transform variable to 0-1
local vars "mail_return_rate_cen_2010 pct_nh_white_alone_acs_13_17 pct_hispanic_acs_13_17 pct_nh_blk_alone_acs_13_17 pct_nh_asian_alone_acs_13_17 pct_college_acs_13_17"
foreach var of local vars{
replace `var' = `var'/100
}

* change the unit of income 
replace  med_hhd_inc_bg_acs_13_17 = med_hhd_inc_bg_acs_13_17 / 1000

* merge to get homicide data 
merge m:1 gidbg using  "${DataCleaned}/bg_homicide_cleaned.dta", keep(master match) nogenerate ///
	keepusing(homicide_count_16 km_to_nearest_homicide16)
		
* generate log population
gen double ln_population_13_17 = log(tot_population_acs_13_17)

* generate asinh(hour/stop/arrest)
local vars "hour_in_bg N_shifts_in_bg bg_stop_count bg_arrest_count"
foreach var of local vars{
	gen asinh_`var' = asinh(`var')
}

* generate arrest/hour; stop/hour and asinh values
gen arrest_to_hour = bg_arrest_count / hour_in_bg
gen stop_to_hour = bg_stop_count / hour_in_bg 
gen asinh_arrest_to_hour = asinh(arrest_to_hour)
gen asinh_stop_to_hour = asinh(stop_to_hour)

* label variables 
label variable tot_population_acs_13_17 "Population"
label variable ln_population_13_17 "Log Population"
label variable pct_college_acs_13_17 "% College Graduates"
label variable med_hhd_inc_bg_acs_13_17 "Median Household Income (1K)"
label variable mail_return_rate_cen_2010 "Census Form Return Rate"
label variable pct_hispanic_acs_13_17 "% Hispanic"
label variable pct_nh_asian_alone_acs_13_17 "% Asian"
label variable pct_nh_blk_alone_acs_13_17 "% Black"
label variable pct_nh_white_alone_acs_13_17 "% White"
label variable homicide_count_16 "Homicide Count 2016"
label variable km_to_nearest_homicide16 "Distance to nearest 2016 homicide (km)"

label variable hour_in_bg "Hour in BG"
label variable N_officers_in_bg "Number of officers"
label variable N_shifts_in_bg "Number of shifts in BG"
label variable asinh_hour_in_bg "arsinh(Hour)"
label variable asinh_N_shifts_in_bg "arsinh(Number of shifts)"

label variable bg_stop_count "Number of stops"
label variable bg_arrest_count "Number of arrests"
label variable asinh_bg_stop_count "arsinh(Number of stops)"
label variable asinh_bg_arrest_count "arsinh(Number of arrests)"

label variable arrest_to_hour "Arrests/Hours"
label variable stop_to_hour "Stops/Hours"
label variable asinh_arrest_to_hour "arsinh(Arrests/Hours)"
label variable asinh_stop_to_hour "arsinh(Stops/Hours)"

save "${DataCleaned}/bg_stop_arrest_per_hour_for_reg.dta", replace 

***************************************************************************************************
* prepare main regression data
***************************************************************************************************

set more off

cap program drop prepare_reg_data
program define prepare_reg_data

	* merge with LEMAS: police racial share for each city
	merge m:1 city using "${DataRaw}/LEMAS_2016_cities_racial_share.dta", nogenerate keep(master match) ///
		keepusing(police_pct_white police_pct_black police_pct_hisp police_pct_asian sup_pct_white sup_pct_black sup_pct_hisp sup_pct_asian)
	 
	* merge with bg-level demographics
	merge m:1 gidbg using "${DataRaw}/acs13-17_bg_demographics.dta", nogenerate keep(master match) ///
	keepusing(tot_population_acs_13_17 mail_return_rate_cen_2010 pct_nh_white_alone_acs_13_17 pct_hispanic_acs_13_17 pct_nh_blk_alone_acs_13_17 pct_nh_asian_alone_acs_13_17 pct_college_acs_13_17 med_hhd_inc_bg_acs_13_17)

	* transform variable to 0-1
	local vars "mail_return_rate_cen_2010 pct_nh_white_alone_acs_13_17 pct_hispanic_acs_13_17 pct_nh_blk_alone_acs_13_17 pct_nh_asian_alone_acs_13_17 pct_college_acs_13_17"
	foreach var of local vars{
	replace `var' = `var'/100
	}

	* change the unit of income 
	replace med_hhd_inc_bg_acs_13_17 = med_hhd_inc_bg_acs_13_17 / 1000

	* merge to get homicide data  
	merge m:1 gidbg using  "${DataCleaned}/bg_homicide_cleaned.dta", keep(master match) nogenerate ///
		keepusing(homicide_count_13_16_avg km_to_nearest_homicide13 km_to_nearest_homicide14 km_to_nearest_homicide15 km_to_nearest_homicide16 homicide_count_16)

	egen km_to_nearest_homicide_min_13_16 = rowmin(km_to_nearest_homicide13 km_to_nearest_homicide14 km_to_nearest_homicide15 km_to_nearest_homicide16) 

	drop km_to_nearest_homicide13 km_to_nearest_homicide14 km_to_nearest_homicide15
	************************************
	* transform policing variables 
	************************************
	gen double asinh_hour_in_bg = asinh(hour_in_bg)
	gen double asinh_N_shifts_in_bg = asinh(N_shifts_in_bg)
	gen double ln_population_13_17 = log(tot_population_acs_13_17)

	************************************
	** Calculate Mean-Centered Values 
	************************************

	* Absolute Black 
	egen m_pct_nh_blk_alone_acs_13_17 = mean(pct_nh_blk_alone_acs_13_17)
	gen mc_pct_nh_blk_alone_acs_13_17 = pct_nh_blk_alone_acs_13_17 - m_pct_nh_blk_alone_acs_13_17

	* Police: Absolute share
	egen m_police_pct_black = mean(police_pct_black)
	egen m_police_pct_hisp = mean(police_pct_hisp)
	egen m_police_pct_asian = mean(police_pct_asian)
	egen m_police_pct_white = mean(police_pct_white)

	gen mc_police_pct_black = police_pct_black - m_police_pct_black
	gen mc_police_pct_hisp = police_pct_hisp - m_police_pct_hisp
	gen mc_police_pct_asian = police_pct_asian - m_police_pct_asian
	gen mc_police_pct_white = police_pct_white - m_police_pct_white

	egen m_sup_pct_black = mean(sup_pct_black)
	egen m_sup_pct_hisp = mean(sup_pct_hisp)
	egen m_sup_pct_asian = mean(sup_pct_asian)
	egen m_sup_pct_white = mean(sup_pct_white)

	gen mc_sup_pct_black = sup_pct_black - m_sup_pct_black
	gen mc_sup_pct_hisp = sup_pct_hisp - m_sup_pct_hisp
	gen mc_sup_pct_asian = sup_pct_asian - m_sup_pct_asian
	gen mc_sup_pct_white = sup_pct_white - m_sup_pct_white
	
	drop m_pct_nh_blk_alone_acs_13_17 m_police_pct_* m_sup_pct_*

	************************************
	* Calculate interaction terms
	************************************
	gen mc_pct_blk_X_police_blk = mc_pct_nh_blk_alone_acs_13_17 * mc_police_pct_black
	gen mc_pct_blk_X_sup_blk =  mc_pct_nh_blk_alone_acs_13_17 * mc_sup_pct_black

	************************************
	* label variables 
	************************************
	label variable hour_in_bg "Hour"
	label variable N_officers_in_bg "Number of officers"
	label variable N_shifts_in_bg "Number of shifts"
	label variable asinh_hour_in_bg "arsinh(Hour)"
	label variable asinh_N_shifts_in_bg "arsinh(Number of shifts)"
	
	label variable homicide_count_16 "Homicide Count 2016"
	label variable homicide_count_13_16_avg "Avg 13-16 Homicide Count"
	label variable km_to_nearest_homicide_min "Distance to nearest 13-16 homicide (km)"
	label variable km_to_nearest_homicide16 "Distance to nearest 2016 homicide (km)"

	label variable pct_hispanic_acs_13_17 "% Hispanic"
	label variable pct_nh_asian_alone_acs_13_17 "% Asian"
	label variable pct_nh_blk_alone_acs_13_17 "% Black"
	label variable pct_nh_white_alone_acs_13_17 "% White"

	label variable tot_population_acs_13_17 "Population"
	label variable ln_population_13_17 "Log Population"
	label variable pct_college_acs_13_17 "% College Graduates"
	label variable med_hhd_inc_bg_acs_13_17 "Median Household Income (1K)"
	label variable mail_return_rate_cen_2010 "Census Form Return Rate"

	label variable police_pct_black "Police: % Black"
	label variable police_pct_white "Police: % White"
	label variable police_pct_hisp "Police: % Hispanic"
	label variable police_pct_asian "Police: % Asian"

	label variable sup_pct_black "Supervisor: % Black"
	label variable sup_pct_white "Supervisor: % White"
	label variable sup_pct_hisp "Supervisor: % Hispanic"
	label variable sup_pct_asian "Supervisor: % Asian"

	label variable mc_pct_nh_blk_alone_acs_13_17 "% Black"
	label variable mc_police_pct_black "Police: % Black"
	label variable mc_police_pct_hisp "Police: % Hispanic"
	label variable mc_police_pct_asian "Police: % Asian"
	label variable mc_police_pct_white "Police: % White"

	label variable mc_sup_pct_black "Supervisor: % Black"
	label variable mc_sup_pct_hisp "Supervisor: % Hispanic"
	label variable mc_sup_pct_asian "Supervisor: % Asian"
	label variable mc_sup_pct_white "Supervisor: % White"

	label variable mc_pct_blk_X_police_blk "BG % Black X Police: % Black"
	label variable mc_pct_blk_X_sup_blk "BG % Black X Supervisor: % Black"
end 

* prepare main regression data
use "${DataCleaned}/patrol_time_in_bg.dta", clear
prepare_reg_data
save "${DataCleaned}/police_hour_data_for_reg.dta", replace

* prepare robustness check data 
use "${DataCleaned}/patrol_time_in_bg_nonwork.dta", clear
prepare_reg_data
save "${DataCleaned}/police_hour_data_for_reg_nonwork.dta", replace
